knitr::opts_knit$set(root.dir = "/home/rstudio/NEOPLASY_PRIMATES")
knitr::opts_chunk$set(engine.path = list(
python = '/home/rstudio/NEOPLASY_PRIMATES/TreeCluster/venv/bin/python'))
getwd()
## [1] "/home/rstudio/NEOPLASY_PRIMATES/Scripts"

Directory definition

workingDir <- getwd()
dataDir <- file.path(workingDir, "Data")
resultsDir <- file.path(workingDir,"Out") 
getwd()
## [1] "/home/rstudio/NEOPLASY_PRIMATES"

Seed and library loading

set.seed(1998)

# General use libraries
library(ape)
library(RRphylo)
library(phytools)
library(reticulate)
library(dplyr)

# Non-concordant species retrieval 
library(stringdist)
library(taxize)
library(R.utils)

# Sort distances chunk
library(adephylo)
library(castor)

# Cluster merging
library(dplyr)

# Plotting
library(ggplot2)
library(tidyr)
library(dplyr)
library(viridis)
library(ggtree)
library(RColorBrewer)
library(svglite)

# From Alejandro
library(e1071)

# From Noelia
library(tidyverse)
library(ggpubr)
library(FactoMineR)
library(factoextra)
library(corrplot)
# library(GGally)
library(cowplot)
library(ape)
library(nlme)
# library(gt)
library(ggtree)
library(phytools)
library(RRphylo)
library(picante)
library(ggnewscale)
library(reshape)
library(palettetown)
library(compositions)
library(gt)

Tree import and prunning

primates_tree <- read.newick(file.path(dataDir,"233-GENOMES/science.abn7829_data_s4.nex.tree"))
tree_species <- primates_tree$tip.label
cancer_traits <- read.csv(file.path(dataDir, "Neoplasia/species360_primates_neoplasia_20230519.csv"), sep = ",", header = TRUE)

# Remove whitespaces
cancer_traits[] <- lapply(cancer_traits, function(col) trimws(col))

# binarize
cancer_traits$label <- gsub(" ", "_", cancer_traits$species)

# Reorder and filter species
cancer_traits <- cancer_traits %>%
  select(family,species,label,common_name,necropsy_count,neoplasia_necropsy,neoplasia_prevalence, everything()) %>%
  filter(!is.na(neoplasia_prevalence))
# Prune the tree as Noelia does

common_names <- intersect(primates_tree$tip.label, cancer_traits$label)

pruned_primates.tree <- drop.tip(primates_tree, setdiff(primates_tree$tip.label, common_names))

cancer_traits <- cancer_traits %>%
  mutate(ordering = match(label, pruned_primates.tree$tip.label)) %>%
  arrange(ordering)  %>%
  filter(!is.na(ordering))
# Outlier 

cancer_traits_filt <- cancer_traits %>%
  select(-common_name, -ordering, -species)

cancer_traits_filt <- cancer_traits_filt %>%
  mutate(across(c(necropsy_count, neoplasia_necropsy, neoplasia_prevalence), as.double))


means <- t(cancer_traits_filt[,-which(colnames(cancer_traits_filt) %in% c("family", "label"))] %>% 
             summarize_all(., mean))

medians <- t(cancer_traits_filt[,-which(colnames(cancer_traits_filt) %in% c("family", "label"))] %>% 
  summarize_all(., median))

sds <- t(cancer_traits_filt[,-which(colnames(cancer_traits_filt) %in% c("family", "label"))] %>% 
  summarize_all(., list(sd)))

# MELT the df (longize it)
melted_dataframe_traits <- melt(cancer_traits_filt, id.vars = c("family", "label"), measure.vars = c("necropsy_count", "neoplasia_necropsy", "neoplasia_prevalence"))

colnames(melted_dataframe_traits) <- c("Family", "Species", "Trait", "value")

# Add stats to the melted df
for (i in 1:length(melted_dataframe_traits$Species)){
  if(as.character(melted_dataframe_traits$Trait[i]) %in% rownames(medians)){
    melted_dataframe_traits$median[i] <- as.numeric(as.character(medians[rownames(medians) == melted_dataframe_traits$Trait[i]]))
    melted_dataframe_traits$mean[i] <- as.numeric(as.character(means[rownames(means) == melted_dataframe_traits$Trait[i]]))
    melted_dataframe_traits$sd[i] <- as.numeric(as.character(sds[rownames(sds) == melted_dataframe_traits$Trait[i]]))
  }}
calculate_stat <- function(stat_fn) {
  lapply(unique(cancer_traits_filt$family), function(fam) {
    subset_df <- cancer_traits_filt[cancer_traits_filt$family == fam, ]
    stats <- subset_df[,-which(colnames(subset_df) %in% c("label", "family"))] %>% 
               summarize_all(stat_fn)
    data.frame(family = fam, stats)
  })
}

fam_means_list <- calculate_stat(mean)
fam_medians_list <- calculate_stat(median)
fam_sds_list <- calculate_stat(sd)

bind_rows_df <- function(list_data) {
  do.call(rbind, list_data)
}

fam_means <- bind_rows_df(fam_means_list)
fam_medians <- bind_rows_df(fam_medians_list)
fam_sds <- bind_rows_df(fam_sds_list)

for (i in 1:length(melted_dataframe_traits$Species)){
  current_family <- melted_dataframe_traits$Family[i]
  current_trait <- as.character(melted_dataframe_traits$Trait[i])
  
  if(current_trait %in% colnames(fam_means)){
    melted_dataframe_traits$mean_fam[i] <- fam_means[fam_means$family == current_family, current_trait]
    melted_dataframe_traits$median_fam[i] <- fam_medians[fam_medians$family == current_family, current_trait]
    melted_dataframe_traits$sd_fam[i] <- fam_sds[fam_sds$family == current_family, current_trait]
  }
}
# Compute top bottom definitions
# global
melted_dataframe_traits <- melted_dataframe_traits %>%
  dplyr::group_by(Trait) %>% 
  mutate(label = case_when(
    value < median - sd ~ "low_extreme", 
    value > median + sd ~ "high_extreme",
    TRUE ~ "normal")) %>%
      mutate(outlier = case_when(
        value < quantile(value, 0.25,na.rm=TRUE) - 1.5 * IQR(value, na.rm=TRUE) ~ paste0("low_outlier_",Trait),
        value > quantile(value, 0.75,na.rm=TRUE) + 1.5 * IQR(value, na.rm=TRUE) ~ paste0("high_outlier_",Trait),
        TRUE ~ "no"))

# global 3sd calculation
melted_dataframe_traits <- melted_dataframe_traits %>%
  dplyr::group_by(Trait) %>% 
  mutate(label_2sd = case_when(
    value < median - 2*sd ~ "very_low_extreme", 
    value > median + 2*sd ~ "very_high_extreme",
    TRUE ~ "normal"))
# by family
melted_dataframe_traits <- melted_dataframe_traits %>%
  dplyr::group_by(Family, Trait) %>% 
  mutate(z_score_fam = (value-mean)/sd) %>%
  mutate(label_family = case_when(
    value < median_fam - sd_fam ~ "low_extreme", 
    value > median_fam + sd_fam ~ "high_extreme",
    TRUE ~ "normal"))

# family 2sd calculation
melted_dataframe_traits <- melted_dataframe_traits %>%
  dplyr::group_by(Family, Trait) %>%
  mutate(z_score_fam_2sd = (value-mean)/(2*sd)) %>%
  mutate(label_family_2sd = case_when(
    value < median_fam - 2*sd_fam ~ "very_low_extreme", 
    value > median_fam + 2*sd_fam ~ "very_high_extreme",
    TRUE ~ "normal"))

spread_df <- melted_dataframe_traits %>%
  spread(Trait, value)
# Filter dataframe for x and y values
df_x <- melted_dataframe_traits[melted_dataframe_traits$Trait == "necropsy_count", ]
df_y <- melted_dataframe_traits[melted_dataframe_traits$Trait == "neoplasia_necropsy", ]

# Merge them based on a common identifier (assuming you have an 'ID' column)
merged_df <- merge(df_x, df_y, by=c("Species","Family"))

# Map_plot
# Create a scatterplot
scatter_plot <- ggplot(data = merged_df, 
                      aes(x = value.x, y = value.y, color = Family)) + 
  geom_point(data = subset(merged_df, 
                      !(outlier.y %in% c("high_outlier_neoplasia_necropsy", "low_outlier_neoplasia_necropsy"))),
                      size = 4) +  
  # Plot outliers for y variable with a different shape
  geom_point(data = subset(merged_df, outlier.y %in% c("high_outlier_neoplasia_necropsy", "low_outlier_neoplasia_necropsy")),
             aes(shape = outlier.y), size = 4) +
  scale_shape_manual(values = c("high_outlier_neoplasia_necropsy" = 17, "low_outlier_neoplasia_necropsy" = 17)) +
  
  # Label species that are outliers for y variable
  geom_text(data = subset(merged_df, outlier.y %in% c("high_outlier_neoplasia_necropsy", "low_outlier_neoplasia_necropsy")),
            aes(label = Species), nudge_y = 1.5, size = 4, vjust = "right", hjust = "top") +
  
  labs(x = "Necropsy Count", y = "Neoplasia/Necropsy") +
  theme_minimal() +
  geom_smooth(method = "lm", se = TRUE, color="black") +
  theme(
    axis.title.x = element_text(size = 10, hjust = 0.5, margin = margin(t = 20, b = 10)),
    axis.title.y = element_text(size = 10, angle = 90, hjust = 0.5, margin = margin(t = 20, b = 10)),
    legend.position = "none")

print(scatter_plot)
## `geom_smooth()` using formula = 'y ~ x'

model <- lm(value.y ~ value.x, data = merged_df)
summary(model)
## 
## Call:
## lm(formula = value.y ~ value.x, data = merged_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -10.224  -3.196  -0.775   2.660  10.929 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.197603   0.997605  -0.198    0.844    
## value.x      0.078227   0.009278   8.432 3.09e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.746 on 51 degrees of freedom
## Multiple R-squared:  0.5823, Adjusted R-squared:  0.5741 
## F-statistic: 71.09 on 1 and 51 DF,  p-value: 3.093e-11

Top-to-bottom plots

Median - 1SD

df_z <- melted_dataframe_traits[melted_dataframe_traits$Trait == "neoplasia_prevalence", ]
df_z$median_fam <- factor(df_z$median_fam, levels = unique(df_z$median_fam))

# With outliers
ggplot(data = df_z, 
       aes(x = value, 
           y = Family)) +
  geom_point(data = subset(df_z, 
                        !(label_family %in% c("low_extreme", "normal", "high_extreme"))),
                        size = 4) +
  geom_point(data = subset(df_z, label_family %in% c("low_extreme", "normal", "high_extreme")),
               aes(shape = label_family, color = Family), size = 4) +
  scale_shape_manual(values = c("low_extreme" = 22, "normal" = 21, "high_extreme" = 24)) +
  stat_summary(fun.x = median, geom = "errorbar",
        aes(xmax = ..x.., xmin = ..x..), linewidth = 1, color = pokepal("rayquaza", spread = 1)) +
  geom_vline(aes(xintercept = median), linetype = "dotted", color = pokepal("rayquaza", spread = 1)) +
    labs(x = "Neoplasia Prevalence", y = "Families") +
  theme(
    axis.title.x = element_text(size = 10, hjust = 0.5, margin = margin(t = 20, b = 10)),
    axis.title.y = element_text(size = 10, angle = 90, hjust = 0.5, margin = margin(t = 20, b = 20)),
    legend.position = "none") + 
  xlim(0,0.5) 
## Warning in stat_summary(fun.x = median, geom = "errorbar", aes(xmax = ..x.., :
## Ignoring unknown parameters: `fun.x`
## Warning: The dot-dot notation (`..x..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(x)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## No summary function supplied, defaulting to `mean_se()`

Median - 1SD without outliers

# Without outliers

df_z_nooutlier <- df_z[df_z$outlier == "no", ]

ggplot(data = df_z_nooutlier, 
       aes(x = value, 
           y = Family)) +
  geom_point(data = subset(df_z_nooutlier, 
                        !(label_family %in% c("low_extreme", "normal", "high_extreme"))),
                        size = 4) +
  geom_point(data = subset(df_z_nooutlier, label_family %in% c("low_extreme", "normal", "high_extreme")),
               aes(shape = label_family, color = Family), size = 4) +
  scale_shape_manual(values = c("low_extreme" = 22, "normal" = 21, "high_extreme" = 24)) +
  stat_summary(fun.x = median, geom = "errorbar",
        aes(xmax = ..x.., xmin = ..x..), linewidth = 1, color = pokepal("rayquaza", spread = 1)) +
  geom_vline(aes(xintercept = median), linetype = "dotted", color = pokepal("rayquaza", spread = 1)) +
    labs(x = "Neoplasia Prevalence", y = "Families") +
  theme(
    axis.title.x = element_text(size = 10, hjust = 0.5, margin = margin(t = 20, b = 10)),
    axis.title.y = element_text(size = 10, angle = 90, hjust = 0.5, margin = margin(t = 20, b = 20)),
    legend.position = "none") + 
  xlim(0,0.25) 
## Warning in stat_summary(fun.x = median, geom = "errorbar", aes(xmax = ..x.., :
## Ignoring unknown parameters: `fun.x`
## No summary function supplied, defaulting to `mean_se()`

Median - 2SD

ggplot(data = df_z, 
       aes(x = value, 
           y = Family)) +
  geom_point(data = subset(df_z, 
                        !(label_family_2sd %in% c("very_low_extreme", "normal", "very_high_extreme"))),
                        size = 4) +
  geom_point(data = subset(df_z, label_family_2sd %in% c("very_low_extreme", "normal", "very_high_extreme")),
               aes(shape = label_family_2sd, color = Family), size = 4) +
  scale_shape_manual(values = c("very_low_extreme" = 22, "normal" = 21, "very_high_extreme" = 24)) +
  stat_summary(fun.x = median, geom = "errorbar",
        aes(xmax = ..x.., xmin = ..x..), linewidth = 1, color = pokepal("rayquaza", spread = 1)) +
  geom_vline(aes(xintercept = median), linetype = "dotted", color = pokepal("rayquaza", spread = 1)) +
    labs(x = "Neoplasia Prevalence", y = "Families") +
  theme(
    axis.title.x = element_text(size = 10, hjust = 0.5, margin = margin(t = 20, b = 10)),
    axis.title.y = element_text(size = 10, angle = 90, hjust = 0.5, margin = margin(t = 20, b = 20)),
    legend.position = "none") + 
  xlim(0,0.5) 
## Warning in stat_summary(fun.x = median, geom = "errorbar", aes(xmax = ..x.., :
## Ignoring unknown parameters: `fun.x`
## No summary function supplied, defaulting to `mean_se()`

Median - 2SD without outliers

ggplot(data = df_z_nooutlier, 
       aes(x = value, 
           y = Family)) +
  geom_point(data = subset(df_z_nooutlier, 
                        !(label_family_2sd %in% c("very_low_extreme", "normal", "very_high_extreme"))),
                        size = 4) +
  geom_point(data = subset(df_z_nooutlier, label_family_2sd %in% c("very_low_extreme", "normal", "very_high_extreme")),
               aes(shape = label_family_2sd, color = Family), size = 4) +
  scale_shape_manual(values = c("very_low_extreme" = 22, "normal" = 21, "very_high_extreme" = 24)) +
  stat_summary(fun.x = median, geom = "errorbar",
        aes(xmax = ..x.., xmin = ..x..), linewidth = 1, color = pokepal("rayquaza", spread = 1)) +
  geom_vline(aes(xintercept = median), linetype = "dotted", color = pokepal("rayquaza", spread = 1)) +
    labs(x = "Neoplasia Prevalence", y = "Families") +
  theme(
    axis.title.x = element_text(size = 10, hjust = 0.5, margin = margin(t = 20, b = 10)),
    axis.title.y = element_text(size = 10, angle = 90, hjust = 0.5, margin = margin(t = 20, b = 20)),
    legend.position = "none") + 
  xlim(0,0.25) 
## Warning in stat_summary(fun.x = median, geom = "errorbar", aes(xmax = ..x.., :
## Ignoring unknown parameters: `fun.x`
## No summary function supplied, defaulting to `mean_se()`

Histogram visualization for the three traits

hist(cancer_traits_filt$necropsy_count, xlab = "Necropsy Count", main = "Histogram of primates Necropsy Count")

hist(cancer_traits_filt$neoplasia_necropsy, xlab = "Neoplasia Count", main = "Histogram of primates Neoplasia Count")

hist(cancer_traits_filt$neoplasia_prevalence, xlab = "Neoplasia prevalence", main = "Histogram of primates Neoplasia Prevalence")

Visual distribution of the three traits along the phylogeny

Outlier highlight

tree_species <- pruned_primates.tree$tip.label

NC <- cancer_traits_filt$necropsy_count
names(NC) <- tree_species
NC <- NC[!is.na(names(NC))]

NN <- cancer_traits_filt$neoplasia_necropsy
names(NN) <- tree_species
NN <- NN[!is.na(names(NN))]

NP <- cancer_traits_filt$neoplasia_prevalence
names(NP) <- tree_species
NP <- NP[!is.na(names(NP))]

# Necropsy Count
obj_nc<-contMap(pruned_primates.tree,NC,plot=FALSE)
obj_nc<-setMap(obj_nc,c("#077223", "#53EC2D", "white", "#F1F97C", 
                  "#F08431", "#F03131"))

plot(obj_nc,fsize=c(1),lwd=c(5),leg.txt="Necropsy Count")

# Neoplasia Count
obj_nn<-contMap(pruned_primates.tree,NN,plot=FALSE)
obj_nn<-setMap(obj_nn,c("#077223", "#53EC2D", "white", "#F1F97C", 
                  "#F08431", "#F03131"))
plot(obj_nn,fsize=c(1),lwd=c(5),leg.txt="Neoplasia Count")

# Neoplasia_prevalence
obj_np<-contMap(pruned_primates.tree,NP,plot=FALSE)
obj_np<-setMap(obj_np,c("#077223", "#53EC2D", "white", "#F1F97C", 
                  "#F08431", "#F03131"))
plot(obj_np,fsize=c(1),lwd=c(5),leg.txt="Neoplasia Prevalence")

Scatterplots

ggplot(cancer_traits_filt, aes(x=necropsy_count, y=neoplasia_necropsy, color=family))+ 
  geom_point() +
  geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE) +
  
  labs(y = "Neoplasia count", x = "Necropsy count", fill = "Neoplasia/Necropsy") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

ggplot(cancer_traits_filt, aes(x=normalize(necropsy_count), y=normalize(neoplasia_necropsy), color=family))+ 
  geom_point() +
  geom_smooth(method=lm , color="red", fill="#69b3a2", se=TRUE) +
  labs(y = "Neoplasia count", x = "Necropsy count", fill = "Neoplasia/Necropsy") +
  theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'

Summary table

# Clean and Summarize the Data
summary_data <- cancer_traits_filt %>%
  group_by(family) %>%
  # Summarize data
  summarize(
    n = n(),
    NC_Mean = mean(necropsy_count, na.rm = TRUE),
    NC_Median = median(necropsy_count, na.rm = TRUE),
    NC_SD = sd(necropsy_count, na.rm = TRUE),
    min_NC = min(necropsy_count, na.rm = TRUE),
    max_NC = max(necropsy_count, na.rm = TRUE),
    
    NN_Mean = mean(neoplasia_necropsy, na.rm = TRUE),
    NN_Median = median(neoplasia_necropsy, na.rm = TRUE),
    NN_SD = sd(neoplasia_necropsy, na.rm = TRUE),
    min_NN = min(neoplasia_necropsy, na.rm = TRUE),
    max_NN = max(neoplasia_necropsy, na.rm = TRUE),
    
    NP_Mean = mean(neoplasia_prevalence, na.rm = TRUE),
    NP_Median = median(neoplasia_prevalence, na.rm = TRUE),
    NP_SD = sd(neoplasia_prevalence, na.rm = TRUE),
    min_NP = min(neoplasia_prevalence, na.rm = TRUE),
    max_NP = max(neoplasia_prevalence, na.rm = TRUE))  %>% 
  
  # Compute the MLS Limits
  mutate(NC_Limits = paste(min_NC, "-", max_NC),
         NN_Limits = paste(min_NN, "-",max_NN),
         NP_Limits = paste(min_NP, "-",max_NP))

summary_data
## # A tibble: 12 × 20
##    family       n NC_Mean NC_Median  NC_SD min_NC max_NC NN_Mean NN_Median NN_SD
##    <chr>    <int>   <dbl>     <dbl>  <dbl>  <dbl>  <dbl>   <dbl>     <dbl> <dbl>
##  1 Aotidae      1    26        26    NA        26     26    0          0   NA   
##  2 Atelidae     2    34.5      34.5   7.78     29     40    8.5        8.5  6.36
##  3 Callitr…    14   112.       80.5  74.8      41    309    5.57       2    7.84
##  4 Cebidae      2   115       115   109.       38    192    7          7    5.66
##  5 Cercopi…    16    64.8      56    39.4      21    158    2.88       2    2.66
##  6 Cheirog…     2    44        44    24.0      27     61    8.5        8.5  7.78
##  7 Galagid…     2    65        65    26.9      46     84    2          2    1.41
##  8 Hominid…     4    65        61.5  44.7      24    113    8.5        8.5  7.59
##  9 Hylobat…     1    58        58    NA        58     58    7          7   NA   
## 10 Indriid…     1    50        50    NA        50     50    4          4   NA   
## 11 Lemurid…     6   115.       57.5 144.       28    399   15.2       10   11.7 
## 12 Lorisid…     2    52        52    35.4      27     77    7.5        7.5  7.78
## # ℹ 10 more variables: min_NN <dbl>, max_NN <dbl>, NP_Mean <dbl>,
## #   NP_Median <dbl>, NP_SD <dbl>, min_NP <dbl>, max_NP <dbl>, NC_Limits <chr>,
## #   NN_Limits <chr>, NP_Limits <chr>
# Use the gt function to create the table
table_display <- summary_data %>%
  select(family,n,NC_Mean,NC_Median,NC_SD,NC_Limits,
         NN_Mean,NN_Median,NN_SD,NN_Limits,
         NP_Mean,NP_Median,NP_SD,NP_Limits) %>%
  gt() %>%
  
  tab_header(title = "Table. Mean NC, NN and NP Computed Using All Records from our Primate Database") %>%
  
  # Formatting columns for better display
  fmt_number(columns = c("NC_Mean", "NC_Median", "NC_SD", "NN_Mean", "NN_Median", "NN_SD", "NP_Mean", "NP_Median", "NP_SD"), decimals = 2) %>%
  
  # Adding and removing table lines
  tab_options(
    table.border.top.style = "none",  # Remove the top border
    table.border.bottom.style = "solid",
    table.border.bottom.width = 2,
    column_labels.border.top.style = "double",
    column_labels.border.bottom.style = "double")

# Display the table
table_display
Table. Mean NC, NN and NP Computed Using All Records from our Primate Database
family n NC_Mean NC_Median NC_SD NC_Limits NN_Mean NN_Median NN_SD NN_Limits NP_Mean NP_Median NP_SD NP_Limits
Aotidae 1 26.00 26.00 NA 26 - 26 0.00 0.00 NA 0 - 0 0.00 0.00 NA 0 - 0
Atelidae 2 34.50 34.50 7.78 29 - 40 8.50 8.50 6.36 4 - 13 0.27 0.27 0.25 0.1 - 0.448275862068966
Callitrichidae 14 112.50 80.50 74.75 41 - 309 5.57 2.00 7.84 0 - 27 0.04 0.03 0.04 0 - 0.105590062111801
Cebidae 2 115.00 115.00 108.89 38 - 192 7.00 7.00 5.66 3 - 11 0.07 0.07 0.02 0.0572916666666667 - 0.0789473684210526
Cercopithecidae 16 64.75 56.00 39.41 21 - 158 2.88 2.00 2.66 0 - 10 0.04 0.04 0.03 0 - 0.106382978723404
Cheirogaleidae 2 44.00 44.00 24.04 27 - 61 8.50 8.50 7.78 3 - 14 0.17 0.17 0.08 0.111111111111111 - 0.229508196721311
Galagidae 2 65.00 65.00 26.87 46 - 84 2.00 2.00 1.41 1 - 3 0.04 0.04 0.04 0.0119047619047619 - 0.0652173913043478
Hominidae 4 65.00 61.50 44.70 24 - 113 8.50 8.50 7.59 1 - 16 0.11 0.13 0.05 0.0333333333333333 - 0.150537634408602
Hylobatidae 1 58.00 58.00 NA 58 - 58 7.00 7.00 NA 7 - 7 0.12 0.12 NA 0.120689655172414 - 0.120689655172414
Indriidae 1 50.00 50.00 NA 50 - 50 4.00 4.00 NA 4 - 4 0.08 0.08 NA 0.08 - 0.08
Lemuridae 6 114.67 57.50 143.66 28 - 399 15.17 10.00 11.70 7 - 38 0.20 0.18 0.09 0.0952380952380952 - 0.321428571428571
Lorisidae 2 52.00 52.00 35.36 27 - 77 7.50 7.50 7.78 2 - 13 0.12 0.12 0.07 0.0740740740740741 - 0.168831168831169
#gtsave(table_display, filename = file.path(resultsDir,"descriptive/primates_table.png"))
top1SD_spp <- df_z_nooutlier %>%
  filter(label_family == "high_extreme") %>%
  select(label_family,Species,value)
## Adding missing grouping variables: `Family`, `Trait`
bottom1SD_spp <- df_z_nooutlier %>%
  filter(label_family == "low_extreme") %>%
  select(label_family,Species,value)
## Adding missing grouping variables: `Family`, `Trait`
# family_names <- unique(cancer_traits_filt$family)
# 
# get_uid <- function(family_name) {
#   # Try fetching UID for species
#   uid <- tryCatch({
#     ggimage::phylopic_uid(family_name)
#   }, error = function(e) {
#     return(NULL) # return NULL on error
#   })
#   return(uid)
# }
# 
# for uid in 
# 
# fam_uids <- sapply(family_names, get_uid)
# 
# header_images <- paste0('<img src="', primate_images, '" height="50px">') %>%
#   paste(collapse = " ")
# 
# # Load the ggplot2 library
# library(ggplot2)
# 
# # Combine the data
# combined_data <- rbind(
#   transform(top1SD_spp, category = "Top Species"),
#   transform(bottom1SD_spp, category = "Bottom Species")
# )
# 
# combined_data$Species <- gsub("_", " ", combined_data$Species)
# 
# # Load necessary libraries
# library(gt)
# 
# combined_data %>%
#   gt() %>%
#   tab_header(title = “PRIMATES NEOPLASY ANALSYS”) %>%
#   cols_label(
#     Species = 'Species',
#     neoplasia_prevalence = 'Neoplasia Prevalence') %>%
#   tab_spanner(
#     label = md('**Top Species**') %>% 
#   tab_style(
#     style = list(cell_fill(color = “#b2f7ef”),
#     cell_text(weight = “bold”)),
#     locations = cells_body(columns = mpg))%>%
#   tab_style(
#     style = list(cell_fill(color = “#ffefb5”),
#     cell_text(weight = “bold”)), 
#     locations = cells_body(columns = hp))

Continous variable analysis

library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
## 
## Attaching package: 'GGally'
## The following object is masked from 'package:R.utils':
## 
##     wrap
## The following object is masked from 'package:emmeans':
## 
##     pigs
# Check continuous variables distribution
p <- cancer_traits_filt %>% 
  select(necropsy_count, neoplasia_necropsy, neoplasia_prevalence) %>%
  ggpairs()

# Select 2 variables, remove missing variables and calculate spearman correlation from them
cor_numer <- cancer_traits_filt %>% 
  select(necropsy_count, neoplasia_necropsy, neoplasia_prevalence) %>% # select_if(is.numeric) 
  drop_na() %>%
  cor(., method = "spearman") #,use = "pairwise.complete.obs" )

#  Correlation plot
corrplot.mixed(cor_numer, number.digits = 2, tl.col = "black") # Set the color of text labels for correlation numbers